In more than 100 samples, 35,500 reporters for 86 TFs were transfected into various cells (mainly mES, K562 & HepG2), and stimulated with various compounds. To be more precise, the data consists of two different libraries, library 1, which is more focused on mES biology, and library 2, which is more focused on classical signaling pathways. All the barcode counts from the 7 different sequencing runs are combined in this analysis. In a previous script (mt20211021_barcode_preprocessing… .Rmd), sanity checks were be carried out to filter out samples with bad quality, and barcode counts were be normalized to compute reporter activitites. In this script, plots will be generated to characterize the TF reporter activities and models will be built to extract the importance of reporter features.
knitr::opts_chunk$set(echo = TRUE)
StartTime <-Sys.time()
# 8-digit Date tag:
Date <- substr(gsub("-","",Sys.time()),1,8)
# libraries:
library(RColorBrewer)
library(ggplot2)
library(dplyr)
library(maditr)
library(tibble)
library(pheatmap)
library(ggpubr)
library(ggbeeswarm)
library(ggforce)
library(viridis)
library(plyr)
library(cowplot)
library(gridExtra)
library(pROC)
library(tidyr)
library(stringr)
library(randomForest)### Custom functions
SetFileName <- function(filename, initials) {
# Set filename with extension and initials to make filename with date integrated.
filename <- substitute(filename)
initials <- substitute(initials)
filename <- paste0(initials, Date, filename)
filename
}
# Set custom ggplot2 theme and custom colors
theme_classic_lines <- function() {
theme_pubr(border = F, legend = "top") +
theme(panel.grid.major = element_line(colour = "#adb5bd", size = 0.25),
strip.background = element_rect(fill = "#ced4da")
)
}
theme_classic_lines_45 <- function() {
theme_pubr(border = T, legend = "top", x.text.angle = 45) +
theme(panel.grid.major = element_line(colour = "#adb5bd", size = 0.25),
strip.background = element_rect(fill = "#ced4da")
)
}
theme_classic_lines_90 <- function() {
theme_pubr(border = T, legend = "top", x.text.angle = 90) +
theme(panel.grid.major = element_line(colour = "#adb5bd", size = 0.25),
strip.background = element_rect(fill = "#ced4da")
)
}
theme_set(theme_classic_lines())
colors_diverse <- c("#264653", "#9AC1AE", "#5D987B", "#f2cc8f", "#e76f51")
ggplot_custom <- function(...) ggplot2::ggplot(...) +
scale_color_manual(values = colors_diverse) +
scale_fill_manual(values = colors_diverse)# Import processed bc counts from the preprocessing step
cDNA_df <- read.csv("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/gcf6578_gen2_stimulation-2/results/mt20211118_reporter_activity_filt_combined.csv", header = T) %>%
dplyr::select(-tf_activity, -reporter_id2, 'pval_minP' = pval, -pval_adj, -mutated_activity)
# Filter TP53 from library 1+2 K562 data (we previously determined that we should filter these data)
cDNA_df2 <- cDNA_df %>%
filter(library == "1+2" & condition == "K562_DMSO" & tf == "TP53")
cDNA_df <- anti_join(cDNA_df, cDNA_df2, by = c("reporter_id", "sample_id"))Display reporter activities relative to background activity per tf and condition
# Calculate fdr from the previously calculated p-values to correct for multiple hypotheses
cDNA_df$pval_adj_minP <- p.adjust(cDNA_df$pval_minP, method = 'fdr')
# Plot the reporter activities over the minimal promoter only and the adjusted p-value for each reporter
for (i in unique(cDNA_df$condition)) {
p <- ggplot_custom(cDNA_df %>%
filter(condition == i, neg_ctrls == "No") %>%
dplyr::select(reporter_id, pval_adj_minP, reporter_activity_minP, promoter) %>%
unique(),
aes(x = log2(reporter_activity_minP), y = -log10(pval_adj_minP),
color = ifelse(log2(reporter_activity_minP) > 1 & pval_adj_minP < 0.05, "green",
ifelse(log2(reporter_activity_minP) < -1 & pval_adj_minP < 0.05, "yellow","black")))) +
geom_point() +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = log2(2), linetype = "dashed") +
geom_vline(xintercept = -log2(2), linetype = "dashed") +
facet_wrap(~promoter) +
ggtitle(paste('reporter activity over minimal promoter only, condition:', i))
print(p)
}# Same plot for one condition only
ggplot_custom(cDNA_df %>%
filter(condition == "HepG2_CDCA", neg_ctrls == "No") %>%
dplyr::select(reporter_id, pval_adj_minP, reporter_activity_minP, promoter) %>%
unique() %>%
mutate(reporter_activity_minP = log2(reporter_activity_minP),
pval_adj_minP = -log10(pval_adj_minP)),
aes(x = reporter_activity_minP, y = pval_adj_minP,
color = ifelse(reporter_activity_minP > 1 & pval_adj_minP > -log10(0.05), ifelse(reporter_activity_minP > 3 & pval_adj_minP > -log10(0.05), "green", "blue"),
ifelse(reporter_activity_minP < -1 & pval_adj_minP > -log10(0.05), "yellow","black")))) +
geom_point() +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = log2(2), linetype = "dashed") +
geom_vline(xintercept = -log2(2), linetype = "dashed") +
facet_wrap(~promoter) +
ggtitle('reporter activity over minimal promoter only, HepG2_CDCA')# Same plot for one condition, but now plotting reporter_activity_neg and its significance
ggplot_custom(cDNA_df %>%
filter(condition == "HepG2_CDCA", neg_ctrls == "No", library == "1+2") %>%
dplyr::select(reporter_id, pval_neg, reporter_activity_neg, promoter) %>%
mutate(pval_neg = p.adjust(pval_neg, method = 'fdr')) %>%
unique(),
aes(x = log2(reporter_activity_neg), y = -log10(pval_neg),
color = ifelse(log2(reporter_activity_neg) > 1 & pval_neg < 0.05, "green",
ifelse(log2(reporter_activity_neg) < -1 & pval_neg < 0.05, "yellow","black")))) +
geom_point() +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
geom_vline(xintercept = log2(2), linetype = "dashed") +
geom_vline(xintercept = -log2(2), linetype = "dashed") +
facet_wrap(~promoter) +
ggtitle('reporter activity over paired negative control, HepG2_CDCA')## Warning: Removed 884 rows containing missing values (geom_point).
# Stratify reporters by their activity
cDNA_df$reporter_activity_class <- 0
cDNA_df$reporter_activity_class[cDNA_df$pval_adj_minP < 0.05 & cDNA_df$reporter_activity_minP > 2] <- 1 # significantly active = significant 2-fold increase
cDNA_df$reporter_activity_class[cDNA_df$pval_adj_minP < 0.05 & cDNA_df$reporter_activity_minP > 8] <- 2 # significantly highly active = significant 8-fold increase
cDNA_df$reporter_activity_class[cDNA_df$pval_adj_minP < 0.05 & cDNA_df$reporter_activity_minP < 0.5] <- -1 # significantly repressive = significant 2-fold decrease
## Sub-select relevant data
cDNA_df2 <- cDNA_df %>%
filter(neg_ctrls == "No", commercial_reporter == "No") %>%
dplyr::select(reporter_id, tf, reporter_activity_class, condition, promoter) %>%
unique()
# Calculate per condition how many reporters per TF are upregulated, downregulated, or unchanged
cDNA_df2 <- cDNA_df2 %>%
mutate(class_count = ave(reporter_activity_class, tf, promoter, condition, FUN = function(x) sum(x)))
for (i in unique(cDNA_df2$condition)) {
p <- ggplot_custom(cDNA_df2 %>%
filter(condition == i),
aes(x = reorder(tf, -class_count), fill = factor(reporter_activity_class, levels = c("0", "1", "2", "-1")))) +
geom_bar(position = "fill") +
geom_hline(yintercept = 0.25, linetype = "dashed") +
geom_hline(yintercept = 0.75, linetype = "dashed") +
theme_classic_lines_90() +
facet_wrap(~promoter) +
xlab("TF") +
ggtitle(paste('fraction of active/repressive reporters, condition:',i))
print(p)
}## Same plot but for one condition
ggplot_custom(cDNA_df2 %>%
filter(condition == "mES_2i_LIF", promoter == "mCMV"),
aes(x = reorder(tf, -class_count), fill = factor(reporter_activity_class, levels = c("0", "1", "2", "-1")))) +
geom_bar(position = "fill") +
geom_hline(yintercept = 0.25, linetype = "dashed") +
geom_hline(yintercept = 0.75, linetype = "dashed") +
theme_classic_lines_90()+
xlab("TF") +
ggtitle('fraction of active/repressive reporters, mES_2i_LIF - mCMV')How many reporters are classified as active/repressive per tf and condition?
# Divide TFs per condition into different classes: activators, selective activator, inactive, selective repressor, repressor
## Determine amount of active/repressive reporters per tf
cDNA_df3 <- cDNA_df2 %>%
mutate(class_count2 = ave(reporter_activity_class, tf, condition, FUN = function(x) length(x)),
class_count_rel = ave(reporter_activity_class, reporter_activity_class, tf, condition, FUN = function(x) length(x))) %>%
mutate(class_count_rel = class_count_rel / class_count2) %>%
dplyr::select(tf, condition, class_count_rel, reporter_activity_class) %>%
unique() %>%
spread(reporter_activity_class, class_count_rel)
cDNA_df3$class[cDNA_df3$`0` >= 0.75] <- 0 # inactive = >75% of the reporters are inactive
cDNA_df3$class[cDNA_df3$`1` >= 0.75] <- 2 # active = >75% of the reporters are active
cDNA_df3$class[cDNA_df3$`2` >= 0.75] <- 4 # highly active = >75% of the reporters are highly active
cDNA_df3$class[cDNA_df3$`-1` >= 0.75] <- -4 # repressive = >75% of the reporters are repressive
cDNA_df3$class[cDNA_df3$`1` >= 0.25 & cDNA_df3$`1` < 0.75] <- 1 # partly active = >25% & <75% of the reporters are active
cDNA_df3$class[cDNA_df3$`1` < 0.25 & cDNA_df3$`2` < 0.25 & cDNA_df3$`1` + cDNA_df3$`2` >= 0.25] <- 1 # partly active = >25% of the reporters are active or highly active
cDNA_df3$class[cDNA_df3$`2` >= 0.25 & cDNA_df3$`2` < 0.75] <- 3 # partly highly active = >25% of the reporters are highly active
cDNA_df3$class[cDNA_df3$`-1` >= 0.25 & cDNA_df3$`-1` < 0.75] <- -2 # partly repressive = >25% & <75% of the reporters are repressive
cDNA_df3$class[cDNA_df3$`1` >= 0.085 & cDNA_df3$`-1` >= 0.085] <- -1 # mixed active/repressive = > 8.5% are active and >8.5% are repressive
## Sub-select relevant columns, for now only focus on the conditions where I have robust data from both libraries
cDNA_df4 <- cDNA_df3 %>%
dplyr::select(tf, condition, class) %>%
unique() %>%
na.omit() %>%
filter(condition %in% c("HepG2_CDCA", "HepG2_DMSO", "K562_DMSO", "mES_2i",
"mES_2i_LIF", "mES_HQ", "mES_LIF_CH", "mES_LIF_PD",
"mES_N2B27")) %>%
mutate(sum = ave(class, tf, FUN = function(x) sum(x))) %>%
filter(sum != 0) %>%
dplyr::select(-sum)
cDNA_df4 <- cDNA_df4 %>%
spread(tf, class) %>%
column_to_rownames('condition')
## Make custom color palette
makeColorRampPalette <- function(colors, cutoff.fraction, num.colors.in.palette)
{
stopifnot(length(colors) == 4)
ramp1 <- colorRampPalette(colors[1:2])(num.colors.in.palette * cutoff.fraction)
ramp2 <- colorRampPalette(colors[2:4])(num.colors.in.palette * (1 - cutoff.fraction))
return(c(ramp1, ramp2))
}
cols <- makeColorRampPalette(c(colors_diverse[4], colors_diverse[1], colors_diverse[2], colors_diverse[3]),
1-(4/6),
100)
## Plot heatmap showing fraction of active reporters
pheatmap(as.matrix(cDNA_df4),
color = cols,
border_color = "white",
cellwidth = 8, cellheight = 8,
angle_col = 90,
main = "fraction of active/repressive reporters per tf")Why are the reporters classified as repressive/activating? Fit linear model to reporter classification
# Sub-select relevant data
cDNA_df_lm <- cDNA_df %>%
filter(neg_ctrls == "No", commercial_reporter == "No", promoter != "Random",
condition %in% c("HepG2_CDCA", "HepG2_DMSO", "K562_DMSO", "mES_2i",
"mES_2i_LIF", "mES_HQ", "mES_LIF_CH", "mES_LIF_PD",
"mES_N2B27"
),
tf %in% names(cDNA_df4)) %>%
mutate(background = as.factor(background)) %>%
dplyr::select(reporter_id, tf, reporter_activity_class, condition, promoter, spacing, background, distance) %>%
unique()
cDNA_df3[is.na(cDNA_df3)] <- 0
# Only select tfs for which differences in reporter activity were observed (for the other ones it doesn't make sense to fit models)
mixed_classes <- cDNA_df3 %>%
mutate(id = paste(condition, tf, sep = "_")) %>%
dplyr::select(-class) %>%
#filter(class != 0) %>%
unique() %>%
filter(`1` < 0.95, `0` < 0.95, `2` < 0.95)
# Select only relevant tfs from source data frame and only include ids with more than two entries
cDNA_df_lm$id <- paste(cDNA_df_lm$condition, cDNA_df_lm$tf, sep = "_")
cDNA_df_lm <- cDNA_df_lm %>%
mutate(sum = ave(id, id, FUN = function(x) length(x))) %>%
filter(sum >= 2, id %in% mixed_classes$id) %>%
dplyr::select(-sum)
# Fit linear model for each id (combination of condition and tf)
lm_features <- lapply(unique(cDNA_df_lm$id), function(x) lm(reporter_activity_class ~ promoter + spacing + background + distance,
cDNA_df_lm[cDNA_df_lm$id == x,]))
names(lm_features) <- unique(cDNA_df_lm$id)
## Extract and plot weights from linear model
lm_weights <- list()
for (i in names(lm_features)) {
lm_weights[[i]] <- lm_features[[i]]$coefficients
}
lm_weights <- bind_rows(lm_weights, .id = "id")
lm_weights <- lm_weights %>%
dplyr::select(-`(Intercept)`) %>%
pivot_longer(-id, names_to = "feature", values_to = "weight")
ggplot_custom(lm_weights, aes(x = feature, y = weight)) +
geom_quasirandom() +
theme_classic_lines_90() +
ggtitle('weights of all models')## Warning: Removed 279 rows containing missing values (position_quasirandom).
## Extract and plot features from linear model
exp_features <- list()
for (i in names(lm_features)) {
exp_features[[i]] <- anova(lm_features[[i]]) %>%
dplyr::select('sum_sq' = `Sum Sq`) %>%
na.omit() %>%
mutate(sum = sum(sum_sq)) %>%
mutate(rel_sum_sq = (sum_sq/sum)*100) %>%
setnames("rel_sum_sq", "percent") %>%
dplyr::select("percent")
}
exp_features <- bind_rows(exp_features, .id = "id") %>%
rownames_to_column("feature") %>%
mutate(feature = gsub("...[0-9]{1-3}", "", feature)) %>%
mutate(feature = gsub("[0-9]", "", feature)) %>%
mutate(percent = format(percent, scientific = F, digits = 0)) %>%
mutate(percent = as.numeric(percent))
exp_features$group <- "no residual"
exp_features$group[grep("Residual", exp_features$feature)] <- "residual"
exp_features$percent[exp_features$feature == "Residuals"] <- 100 - exp_features$percent[exp_features$feature == "Residuals"]
exp_features$feature <- revalue(exp_features$feature, c("Residuals"="total"))
ggplot_custom(exp_features,
aes(x = percent, y = reorder(feature, -percent), fill = group)) +
geom_bar(stat = "identity") +
xlab("Variance explained (%)") +
ylab("") +
ggtitle('sum of features of all models')## Plot features separately per tf activity class
classes <- cDNA_df3 %>%
mutate(id = paste(condition, tf, sep = "_")) %>%
dplyr::select(id, class) %>%
unique()
exp_features <- merge(exp_features, classes, by = "id")
ggplot_custom(exp_features,
aes(y = percent, x = feature, color = group)) +
geom_quasirandom() +
xlab("Variance explained (%)") +
ylab("") +
facet_wrap(~class) +
theme_classic_lines_90() +
ggtitle('features of all models per tf activity class')## Discard poor performing linear models (<50% of variance explained)
exp_features_good <- exp_features %>%
filter(group == "residual", percent >= 50)
exp_features_sel <- exp_features[exp_features$id %in% exp_features_good$id,]
exp_features_sel <- exp_features_sel %>%
mutate(tf = gsub(".*_(.*)", "\\1", id),
condition = gsub("(.*)_.*", "\\1", id))
## Plot again the features
ggplot_custom(exp_features_sel,
aes(y = percent, x = feature, color = group)) +
geom_quasirandom() +
xlab("Variance explained (%)") +
ylab("") +
ggtitle('features of all models - only good models')## Plot features individually per id
### Order the ids
order <- exp_features_sel %>%
mutate(sum = ave(percent, id, FUN = function(x) sum(x))) %>%
dplyr::select(sum, tf) %>%
unique() %>%
group_by(tf) %>%
top_n(1, sum)
exp_features_sel_ord <- merge(order, exp_features_sel, by = "tf") %>%
filter(feature != "total")
### As bar plot
ggplot(exp_features_sel_ord %>%
filter(condition %in% c("K562_DMSO", "HepG2_DMSO", "mES_2i_LIF")),
aes(y = percent, x = reorder(id, -sum), fill = feature, group = feature)) +
geom_bar(stat = "identity", position = "stack") +
xlab("Variance explained (%)") +
ylab("") +
scale_fill_manual(values = c("#e07a5f", "#3d405b", "#81b29a", "#f2cc8f"))+
theme_classic_lines_90() +
ggtitle('feature importance per tf & condition')### As connected line-plot
ggplot_custom(exp_features_sel, aes(x=feature, y=percent, color=group, group = id)) +
geom_line() +
geom_point()+
ggtitle('feature importance per tf & condition')### Only select the three main cell lines to simplify
exp_features_sel_heatmap <- exp_features_sel %>%
filter(condition %in% c("mES_2i_LIF", "K562_DMSO", "HepG2_DMSO")) %>%
mutate(cell_type = gsub("(.*)_.*", "\\1", condition)) %>%
mutate(cell_type = gsub("(.*)_.*", "\\1", cell_type)) %>%
dplyr::select(-group, -condition) %>%
filter(feature != "total") %>%
spread(feature, percent) %>%
column_to_rownames("id") #%>%
#arrange(tf)
cell_type <- exp_features_sel_heatmap %>%
dplyr::select(cell_type)
colors_pheatmap <- list('cell_type' = c("K562" = colors_diverse[1], "HepG2" = colors_diverse[5], "mES" = colors_diverse[4]))
pheatmap(as.matrix(t(exp_features_sel_heatmap %>% dplyr::select(-cell_type, -tf, -class))),
color = colorRampPalette(brewer.pal(n = 7, name = "Greys"))(100),
border_color = "white",
cellwidth = 8, cellheight = 8,
angle_col = 90, annotation_col = cell_type, cluster_cols = T,
annotation_colors = colors_pheatmap,
main = 'feature importance per tf & condition')Can we select the best reporter for each TF?
# Select conditions for which I have robust data, remove highly active reporters that I took over from library 1 and included in library 2 (because those distort the activity distributions)
cDNA_df2 <- cDNA_df %>%
filter(condition %in% c("HepG2_DMSO", "K562_DMSO", "mES_2i_LIF", "mES_2i", "mES_N2B27", "mES_LIF_PD", "mES_LIF_CH"), library != "1+2")
tfs_lib1 <- cDNA_df %>%
filter(library == "1") %>%
dplyr::select(tf) %>%
unique()
cDNA_df3 <- cDNA_df %>%
filter(condition %in% c("HepG2_DMSO", "K562_DMSO", "mES_2i_LIF", "mES_2i", "mES_N2B27", "mES_LIF_PD", "mES_LIF_CH"), library == "1+2")
cDNA_df3 <- cDNA_df3[!cDNA_df3$tf %in% tfs_lib1$tf,]
cDNA_df<- rbind(cDNA_df2, cDNA_df3)
# Explore the ids for which I was able to fit a good linear model
ids_selected <- cDNA_df_lm %>%
filter(id %in% rownames(exp_features_sel_heatmap)) %>%
mutate(reporter_activity_class = as.character(reporter_activity_class),
sum_all = ave(id, id, FUN = function(x) length(x)),
sum_class = ave(id, id, reporter_activity_class, FUN = function(x) length(x))) %>%
mutate(sum_all = as.numeric(sum_all),
sum_class = as.numeric(sum_class),
frac = sum_class / sum_all)
## Compare for some example tfs the activity fraction in different conditions
reporter_correlations <- cDNA_df %>%
filter(tf %in% c("GRHL1", "POU5F1::SOX2", "MAF::NFE2"),
condition %in% c("mES_2i_LIF", "K562_DMSO", "HepG2_DMSO")) %>%
dplyr::select(reporter_id, condition, tf, pval_adj_minP) %>%
mutate(pval_adj_minP = -log10(pval_adj_minP)) %>%
unique() %>%
spread(condition, pval_adj_minP)
ggplot_custom(reporter_correlations, aes(x = K562_DMSO, y = mES_2i_LIF)) +
geom_point() +
facet_wrap(~tf, scales = "free")Comparison with TF expression and MPRA-inferred TF activities (Ernst et al.)
# Extract TF activities for K562 & HepG2
tf_activities <- cDNA_df %>%
filter(condition %in% c("K562_DMSO", "HepG2_DMSO"), neg_ctrls == "No") %>%
dplyr::select(reporter_id, reporter_activity_minP, condition, tf) %>%
unique() %>%
mutate(tf = gsub("_.*", "", tf)) %>%
spread(condition, reporter_activity_minP) %>%
mutate(dif = log2(K562_DMSO / HepG2_DMSO)) %>%
dplyr::select(tf, dif, reporter_id)
#write_csv2(tf_activities, "/DATA/usr/m.trauernicht/projects/SuRE-TF/ATAC_seq_hepg2/diffTF/tf_reporter_activities_all.csv")
# Correlate with RNA-seq
load('/DATA/usr/m.martinez.ara/GitLab/gurten/gurten/data/fc181121_epsure_tadec_rnaseq_expr_joshi.RData')
# remove Serum data and calculate mean expression
tib_fpkm_joshi$twoi <- (tib_fpkm_joshi$twoi_1 + tib_fpkm_joshi$twoi_2)/2
tib_fpkm_joshi <- tib_fpkm_joshi %>%
dplyr::select(gene_id, 'tf' = symbol, twoi)
tf_activities2 <- cDNA_df %>%
filter(condition == "mES_2i_LIF", neg_ctrls == "No") %>%
dplyr::select(reporter_id, reporter_activity_minP, tf) %>%
unique() %>%
mutate(tf = gsub("_.*", "", tf),
tf_activity = ave(reporter_activity_minP, tf, FUN = function(x) mean(x))) %>%
dplyr::select(tf_activity, tf) %>%
unique()
tf_activities2 <- merge(tf_activities2, tib_fpkm_joshi, by = "tf") %>%
mutate(tf_activity = (tf_activity-mean(tf_activity))/sd(tf_activity),
twoi = (twoi-mean(twoi))/sd(twoi))
ggplot_custom(tf_activities2, aes(x = twoi, y = tf_activity)) +
geom_point() +
ggtitle('TF reporter activities vs. TF expression in mESCs')# Correlate with regulon (VIPER, Aerts)
# Correlate with TF activities from Ernst et al. (https://doi.org/10.1038/nbt.3678)
ernst_activities <- read.table("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/ernst_tf_activities_hepg2_k562.csv", sep = ";", dec = ",", header = T, fill = T)
ernst_activities <- ernst_activities %>%
dplyr::select('tf' = TF, activity_enrichment_HepG2, activity_enrichment_K562) %>%
mutate(avg_score_HepG2 = ave(activity_enrichment_HepG2, tf, FUN = function(x) mean(x)),
avg_score_K562 = ave(activity_enrichment_K562, tf, FUN = function(x) mean(x))) %>%
unique() %>%
pivot_longer(-tf, names_to = "condition", values_to = "ernst_activity") %>%
mutate(condition = gsub("avg_score_", "", condition))
tf_activities <- cDNA_df %>%
filter(condition %in% c("K562_DMSO", "HepG2_DMSO"), neg_ctrls == "No") %>%
dplyr::select(reporter_activity_minP, condition, tf) %>%
mutate(tf = gsub("_.*", "", tf),
condition = gsub("_DMSO", "", condition),
reporter_activity_minP = ave(reporter_activity_minP, condition, tf, FUN = function(x) mean(x))) %>%
unique()
ernst_activities <- merge(ernst_activities, tf_activities, by = c('tf', 'condition')) %>%
mutate(ernst_activity = (ernst_activity-mean(ernst_activity))/sd(ernst_activity),
reporter_activity_minP = (reporter_activity_minP-mean(reporter_activity_minP))/sd(reporter_activity_minP))
ggplot_custom(ernst_activities,
aes(x = ernst_activity, y = reporter_activity_minP)) +
geom_point() +
geom_abline(linetype = "dashed") +
geom_smooth(method = "lm", color = "grey", fill = "lightgrey") +
facet_wrap(~condition) +
ggtitle('TF reporter activities vs. MPRA-chunk-inferred TF activities')## `geom_smooth()` using formula 'y ~ x'
#write_csv2(ernst_activities, "/DATA/usr/m.trauernicht/projects/SuRE-TF/ATAC_seq_hepg2/diffTF/tf_reporter_activities_ernst.csv")display actual activity values per condition and TF instead of fraction of active reporters per TF
## Prepare dataframe
# Caculate mean TF activity per condition
tf_activity_heatmap <- cDNA_df %>%
filter(neg_ctrls == "No", commercial_reporter == "No") %>%
mutate(tf_activity = ave(reporter_activity_minP, condition, tf, FUN = function(x) mean(x))) %>%
#mutate(tf_activity = log2(tf_activity)) %>%
#mutate(tf_activity = ave(tf_activity, condition, FUN = function(x) (x-mean(x))/sd(x))) %>%
dplyr::select(tf_activity, condition, tf) %>%
unique() %>%
spread(condition, tf_activity) %>%
remove_rownames %>%
column_to_rownames(var="tf")
# Keeping the scale in the pheatmap function
myBreaks1 <- seq(0,6,0.06)
# Keep only TFs that are in at least one condition above a certain threshold
tf_activity_heatmap <- tf_activity_heatmap[rowSums(tf_activity_heatmap > 4) >= 1,] %>%
na.omit()
#tf_activity_heatmap <- tf_activity_heatmap[,c(2,1,6,7,3,8,4,5)]
# pheatmap function
pheatmap(as.matrix(t(tf_activity_heatmap)),
color = colorRampPalette(brewer.pal(n = 7, name = "Greys"))(100),
breaks = myBreaks1, border_color = "black",
cellwidth = 8, cellheight = 8,
angle_col = 90,
main = 'reporter activities per tf and condition') This heatmap doesn’t take significance into consideration and to me is less representative of tf activity than the fraction of reporter activity classes heatmap
paste("Run time: ",format(Sys.time()-StartTime))## [1] "Run time: 2.488774 mins"
getwd()## [1] "/DATA/usr/m.trauernicht/projects/SuRE-TF/analyses"
date()## [1] "Tue Dec 21 12:37:51 2021"
sessionInfo()## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] randomForest_4.6-14 stringr_1.4.0 tidyr_1.1.3
## [4] pROC_1.17.0.1 gridExtra_2.3 cowplot_1.1.1
## [7] plyr_1.8.6 viridis_0.6.1 viridisLite_0.4.0
## [10] ggforce_0.3.3 ggbeeswarm_0.6.0 ggpubr_0.4.0
## [13] pheatmap_1.0.12 tibble_3.1.2 maditr_0.8.2
## [16] dplyr_1.0.7 ggplot2_3.3.5 RColorBrewer_1.1-2
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.0 jsonlite_1.7.2 splines_4.0.5 carData_3.0-4
## [5] bslib_0.2.5.1 assertthat_0.2.1 highr_0.9 vipor_0.4.5
## [9] cellranger_1.1.0 yaml_2.2.1 lattice_0.20-41 pillar_1.6.1
## [13] backports_1.2.1 glue_1.4.2 digest_0.6.27 ggsignif_0.6.2
## [17] polyclip_1.10-0 colorspace_2.0-2 Matrix_1.3-2 htmltools_0.5.1.1
## [21] pkgconfig_2.0.3 broom_0.7.8 haven_2.4.1 purrr_0.3.4
## [25] scales_1.1.1 tweenr_1.0.2 openxlsx_4.2.4 rio_0.5.27
## [29] mgcv_1.8-34 generics_0.1.0 farver_2.1.0 car_3.0-11
## [33] ellipsis_0.3.2 withr_2.4.2 magrittr_2.0.1 crayon_1.4.1
## [37] readxl_1.3.1 evaluate_0.14 fansi_0.5.0 nlme_3.1-152
## [41] MASS_7.3-53.1 rstatix_0.7.0 forcats_0.5.1 foreign_0.8-81
## [45] beeswarm_0.4.0 tools_4.0.5 data.table_1.14.0 hms_1.1.0
## [49] lifecycle_1.0.0 munsell_0.5.0 zip_2.2.0 compiler_4.0.5
## [53] jquerylib_0.1.4 rlang_0.4.11 grid_4.0.5 labeling_0.4.2
## [57] rmarkdown_2.11 gtable_0.3.0 abind_1.4-5 DBI_1.1.1
## [61] curl_4.3.2 R6_2.5.0 knitr_1.33 utf8_1.2.1
## [65] stringi_1.7.2 Rcpp_1.0.7 vctrs_0.3.8 tidyselect_1.1.1
## [69] xfun_0.24